Check the difference between the eGenes from Yazar et al. (2022) and eGenes identified by CSeQTL from GTEx whole blood RNA-seq data.

Setup

Load R libraries

library(data.table)
library(readxl)
library(stringr)
library(ggplot2)
library(ggpubr)
library(dplyr)
library(reshape2)
theme_set(theme_classic())

params = list(cs_q_cutoff=5e-3, fold_cutoff=1)
cs_q_cutoff = params$cs_q_cutoff
fold_cutoff = params$fold_cutoff

cat("cs_q_cutoff: ",  cs_q_cutoff,  "\n")
## cs_q_cutoff:  0.005
cat("fold_cutoff: ",  fold_cutoff,  "\n")
## fold_cutoff:  1
config = sprintf("cs_q_%.0e_fold_%.1f", 
                 cs_q_cutoff, fold_cutoff)
config
## [1] "cs_q_5e-03_fold_1.0"

Data directory and file names

CSeQTL_dir     = "../../CSeQTL/data2share/"
GTEx_eqtl_file = "GTExBlood_eqtls.rds"

Yazar_2022_dir = "../../CSeQTL/Yazar_2022/"

Yazar_2022_kept_folder = "Yazar_eSNP1_50kb/"
Yazar_2022_genes       = "Yazar_gene_considered.rds"

Read data/results and make comparison

Read eGenes for comparison.

fnm = sprintf("results/step3_%s_eGenes.rds", config)
eGenes_overlap_list = readRDS(fnm)
sapply(eGenes_overlap_list, function(x){
  sapply(x, function(z){sapply(z, length)})})
##      B IN B Mem CD4 NC CD4 ET CD8 NC CD8 ET CD8 S100B Mono C Mono NC NK R  NK
## [1,]  291   293    336    363    327    340       288    281     213  101 371
## [2,]  131   128    198    194    178    180       174    265     226  211 261

Cell type names

cs_cell_types = c("B", "CD4T", "CD8T", "Monocyte", "NK")

ot_cell_types = c("B IN", "B Mem", "CD4 NC", "CD4 ET", 
                     "CD8 NC", "CD8 ET", "CD8 S100B", "Mono C", 
                     "Mono NC", "NK R", "NK")

ct_groups = list(B = c("B IN", "B Mem"), 
                 CD4T = c("CD4 NC", "CD4 ET"),
                 CD8T = c("CD8 NC", "CD8 ET", "CD8 S100B"),
                 Monocyte = c("Mono C", "Mono NC"),
                 NK = c("NK R", "NK"))

Read in Yazar 2022 results

plist = list()

for (i in 1:length(ot_cell_types)){
  
  ot_ct = ot_cell_types[i]
  ot_ct_str = sub(" ", "_", str_squish(sub("/", "", ot_ct)))
  
  Yazar_ct_eGenes_path = paste0(Yazar_2022_dir, Yazar_2022_kept_folder, 
                            "Yazar_eSNP1_50kb_", ot_ct_str, ".csv")
  
  df_Yazar_ct  = read.csv(Yazar_ct_eGenes_path, header = TRUE)
  
  mat_i = match(eGenes_overlap_list[[ot_ct]][[1]]$Yazar, 
                df_Yazar_ct$Gene.Ensembl.ID)
  stopifnot(sum(is.na(mat_i)) == 0)
  
  df_Yazar_ct = df_Yazar_ct[mat_i,]
  df_Yazar_ct$CSeQTL = 
    df_Yazar_ct$Gene.Ensembl.ID %in% eGenes_overlap_list[[ot_ct]][[1]]$CSeQTL
  table(df_Yazar_ct$CSeQTL)
  
  ks1 = ks.test(log10(df_Yazar_ct$dist[!df_Yazar_ct$CSeQTL] + 1), 
                log10(df_Yazar_ct$dist[df_Yazar_ct$CSeQTL]+1))
  
  plist[[3*(i-1)+1]] = ggplot(df_Yazar_ct, 
                              aes(x=CSeQTL, y=log10(dist+1), color=CSeQTL)) + 
      geom_violin(trim=FALSE) + geom_boxplot(width=0.1) + 
    ggtitle(sprintf("%s, %d vs. %d", ot_ct, sum(!df_Yazar_ct$CSeQTL), 
                    sum(df_Yazar_ct$CSeQTL)), 
            subtitle=sprintf("KS test p-val=%.1e", ks1$p.value))
  
  df_Yazar_ct$qvalue[which(df_Yazar_ct$qvalue < 1e-50)] = 1e-50

  ks1 = ks.test(-log10(df_Yazar_ct$qvalue)[!df_Yazar_ct$CSeQTL], 
                -log10(df_Yazar_ct$qvalue)[df_Yazar_ct$CSeQTL])

  
  plist[[3*(i-1)+2]] = ggplot(df_Yazar_ct, aes(x=CSeQTL, y=-log10(qvalue), 
                                               color=CSeQTL)) + 
      geom_violin(trim=FALSE) + geom_boxplot(width=0.1) + 
    ggtitle(label="", subtitle = sprintf("KS test p-val=%.1e", ks1$p.value))

  ks1 = ks.test(-log10(df_Yazar_ct$FDR)[!df_Yazar_ct$CSeQTL], 
                -log10(df_Yazar_ct$FDR)[df_Yazar_ct$CSeQTL])

  plist[[3*(i-1)+3]] = ggplot(df_Yazar_ct, aes(x=CSeQTL, 
                                               y=-log10(FDR), 
                                               color=CSeQTL)) + 
      geom_violin(trim=FALSE) + geom_boxplot(width=0.1) + 
    ggtitle(label="", subtitle = sprintf("KS test p-val=%.1e", ks1$p.value))

}

length(plist)
## [1] 33
ggarrange(plotlist=plist[1:12], nrow = 4, ncol = 3, 
          common.legend=TRUE, legend="top")

ggarrange(plotlist=plist[13:21], nrow = 4, ncol = 3, 
          common.legend=TRUE, legend="top")

ggarrange(plotlist=plist[22:33], nrow = 4, ncol = 3, 
          common.legend=TRUE, legend="top")

Read in CSeQTL results

GTExWB <- readRDS(paste0(CSeQTL_dir, GTEx_eqtl_file))
length(GTExWB)
## [1] 2
names(GTExWB)
## [1] "eqtl"    "summary"
lapply(GTExWB, dim)
## $eqtl
## [1] 759684     18
## 
## $summary
## [1] 64 13
GTExWB = GTExWB[["eqtl"]]
dim(GTExWB)
## [1] 759684     18
GTExWB[1:2,]
##    chr               ENSG  GENE CELLTYPE                    SNP      MUa
## 1 chr1 ENSG00000000457.13 SCYL3     Bulk chr1:169796199:G:A:b38 181.5319
## 2 chr1 ENSG00000000457.13 SCYL3     Bulk chr1:169891332:G:A:b38       NA
##          ETA         PVAL CISTRANS  trim  perm   AF  MODEL nSNP nIND
## 1  0.7871129 6.937226e-66      CIS FALSE FALSE NULL cseqtl 2969 2969
## 2 -0.2600950 6.222035e-32     <NA> FALSE FALSE NULL    ols 2969 2969
##   permutation_PVAL       qvalue  CT2
## 1     2.059662e-62 2.510377e-62 <NA>
## 2     1.847322e-28 5.945281e-28 <NA>
table(GTExWB$CELLTYPE, useNA = "ifany")
## 
##          B       Bulk       CD4T       CD8T   Monocyte Neutrophil         NK 
##      94437      96138      94779      94419      94611      95492      94713 
##      OTHER 
##      95095
GTExWB$ENSG_version = GTExWB$ENSG
GTExWB$ENSG = gsub("\\..*","", GTExWB$ENSG_version)

GTExWB = GTExWB[which((GTExWB$trim==TRUE) & 
                              (GTExWB$perm==FALSE) & 
                              (GTExWB$MODEL=="cseqtl") & 
                              (!is.na(GTExWB$PVAL))), ]

plist = list()

k = 1
for (i in 1:length(cs_cell_types)){
  
  cct = cs_cell_types[i]
  df_GTEx_i = GTExWB[which(GTExWB$CELLTYPE == cct), ]
  
  for (ct_ot in ct_groups[[cct]]){
    mat_i = match(eGenes_overlap_list[[ct_ot]][[1]]$CSeQTL, df_GTEx_i$ENSG)
    stopifnot(sum(is.na(mat_i)) == 0)
    
    df_GTEx = df_GTEx_i[mat_i,]
    df_GTEx$sc_eGene = df_GTEx$ENSG %in% eGenes_overlap_list[[ct_ot]][[1]]$Yazar
    table(df_GTEx$sc_eGene)
    
    ks1 = ks.test(log10(df_GTEx$MUa[!df_GTEx$sc_eGene]), 
                  log10(df_GTEx$MUa[df_GTEx$sc_eGene]))
    
    plist[[k]] = ggplot(df_GTEx, 
                        aes(x=sc_eGene, y=log10(MUa), color=sc_eGene)) + 
      geom_violin(trim=FALSE) + geom_boxplot(width=0.1) + 
      ggtitle(sprintf("%s, %d vs. %d", ct_ot, 
                      sum(!df_GTEx$sc_eGene), 
                      sum(df_GTEx$sc_eGene)), 
              subtitle=sprintf("KS test p-val=%.1e", ks1$p.value))
    k = k + 1
    
    df_GTEx$qvalue[which(df_GTEx$qvalue < 1e-50)] = 1e-50
    
    ks1 = ks.test(-log10(df_GTEx$qvalue)[!df_GTEx$sc_eGene], 
                  -log10(df_GTEx$qvalue)[df_GTEx$sc_eGene])
  
    plist[[k]] = ggplot(df_GTEx, aes(x=sc_eGene, 
                                     y=-log10(qvalue), 
                                     color=sc_eGene)) + 
      geom_violin(trim=FALSE) + geom_boxplot(width=0.1) + 
      ggtitle(label="", subtitle = sprintf("KS test p-val=%.1e", ks1$p.value))
    k = k + 1

    ks1 = ks.test(abs(log10(df_GTEx$ETA))[!df_GTEx$sc_eGene], 
                  abs(log10(df_GTEx$ETA))[df_GTEx$sc_eGene])
  
    plist[[k]] = ggplot(df_GTEx, aes(x=sc_eGene, 
                                     y=abs(log10(ETA)), 
                                     color=sc_eGene)) + 
      geom_violin(trim=FALSE) + geom_boxplot(width=0.1) + 
      ggtitle(label="", subtitle = sprintf("KS test p-val=%.1e", ks1$p.value))
    k = k + 1
  }
}

length(plist)
## [1] 33
ggarrange(plotlist=plist[1:12], nrow = 4, ncol = 3, 
          common.legend=TRUE, legend="top")

ggarrange(plotlist=plist[13:21], nrow = 4, ncol = 3, 
          common.legend=TRUE, legend="top")

ggarrange(plotlist=plist[22:33], nrow = 4, ncol = 3, 
          common.legend=TRUE, legend="top")

Session information

gc()
##           used (Mb) gc trigger  (Mb) limit (Mb) max used  (Mb)
## Ncells 1279517 68.4    3087424 164.9         NA  3087424 164.9
## Vcells 4324436 33.0   18052722 137.8      32768 22565820 172.2
sessionInfo()
## R version 4.1.2 (2021-11-01)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Big Sur 10.16
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] reshape2_1.4.4    dplyr_1.0.7       ggpubr_0.4.0      ggplot2_3.4.0    
## [5] stringr_1.4.0     readxl_1.3.1      data.table_1.14.2
## 
## loaded via a namespace (and not attached):
##  [1] tidyselect_1.1.1 xfun_0.28        bslib_0.3.1      purrr_0.3.4     
##  [5] carData_3.0-4    colorspace_2.0-2 vctrs_0.5.1      generics_0.1.1  
##  [9] htmltools_0.5.2  yaml_2.2.1       utf8_1.2.2       rlang_1.0.6     
## [13] jquerylib_0.1.4  pillar_1.6.4     glue_1.6.2       withr_2.5.0     
## [17] DBI_1.1.1        plyr_1.8.6       lifecycle_1.0.3  munsell_0.5.0   
## [21] ggsignif_0.6.3   gtable_0.3.0     cellranger_1.1.0 evaluate_0.14   
## [25] labeling_0.4.2   knitr_1.36       fastmap_1.1.0    fansi_0.5.0     
## [29] highr_0.9        broom_0.7.10     Rcpp_1.0.7       scales_1.2.1    
## [33] backports_1.4.0  jsonlite_1.7.2   abind_1.4-5      farver_2.1.0    
## [37] gridExtra_2.3    digest_0.6.29    stringi_1.7.6    rstatix_0.7.0   
## [41] cowplot_1.1.1    grid_4.1.2       cli_3.4.1        tools_4.1.2     
## [45] magrittr_2.0.1   sass_0.4.0       tibble_3.1.6     crayon_1.4.2    
## [49] tidyr_1.1.4      car_3.0-12       pkgconfig_2.0.3  ellipsis_0.3.2  
## [53] assertthat_0.2.1 rmarkdown_2.11   rstudioapi_0.13  R6_2.5.1        
## [57] compiler_4.1.2